{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/hamzafarooq/building-llm-applications-from-scratch/blob/main/Module%201/Module_1_Foundation_LLM_Maven_v2_module_1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2I7DweFkH0xG"
      },
      "outputs": [],
      "source": [
        "!pip install nltk --quiet"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# code courtesy of https://nlpforhackers.io/language-models/\n",
        "# Import necessary libraries\n",
        "import nltk\n",
        "from nltk.corpus import reuters\n",
        "from nltk import bigrams, trigrams\n",
        "from collections import Counter, defaultdict\n",
        "\n",
        "# Download required NLTK data\n",
        "nltk.download('reuters')\n",
        "nltk.download('punkt')\n",
        "\n",
        "# Create a placeholder for the language model\n",
        "# Using nested defaultdict to automatically handle new keys\n",
        "model = defaultdict(lambda: defaultdict(lambda: 0))\n",
        "\n",
        "# Build the trigram model\n",
        "for sentence in reuters.sents():\n",
        "    # Generate trigrams from each sentence\n",
        "    # pad_right and pad_left add None at the beginning and end of the sentence\n",
        "    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):\n",
        "        # Increment the count for this trigram\n",
        "        model[(w1, w2)][w3] += 1\n",
        "\n",
        "# Convert frequency counts to probabilities\n",
        "for w1_w2 in model:\n",
        "    # Calculate total count for this bigram\n",
        "    total_count = float(sum(model[w1_w2].values()))\n",
        "\n",
        "    # Convert each count to a probability\n",
        "    for w3 in model[w1_w2]:\n",
        "        model[w1_w2][w3] /= total_count\n",
        "\n",
        "# At this point, model contains the probabilities of each word (w3)\n",
        "# given the previous two words (w1, w2)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5snK0qDwH6pP",
        "outputId": "13567071-2c54-43db-ac85-49d9e5dc5df4"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package reuters to /root/nltk_data...\n",
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "This code creates a trigram language model using the Reuters corpus. Here's a breakdown of what it does:\n",
        "\n",
        "*   It imports necessary libraries and downloads required NLTK data.\n",
        "*   It creates a nested defaultdict to store the model. This allows for easy counting of trigrams without explicitly checking if a key exists.\n",
        "*   It iterates through all sentences in the Reuters corpus, generating trigrams for each sentence.\n",
        "*   For each trigram, it increments the count in the model.\n",
        "*   After counting all trigrams, it converts the counts to probabilities by dividing each count by the total count for its corresponding bigram.\n",
        "\n",
        "\n",
        "The resulting model can be used to predict the probability of a word given the two preceding words."
      ],
      "metadata": {
        "id": "STmEwY9cp9Ht"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Get the probabilities for words following \"the price\" and sort them\n",
        "# This creates a list of (word, probability) tuples, sorted by probability in descending order\n",
        "sorted_probabilities = sorted(dict(model[\"the\", \"price\"]).items(), key=lambda x: x[1], reverse=True)\n",
        "\n",
        "# Print the sorted probabilities\n",
        "print(\"Most probable words following 'the price', in order:\")\n",
        "for word, prob in sorted_probabilities:\n",
        "    print(f\"{word}: {prob}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5-prAi17J5Fa",
        "outputId": "28c38465-85a9-42aa-f6de-acdb32c768be"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Most probable words following 'the price', in order:\n",
            "of: 0.3209302325581395\n",
            "it: 0.05581395348837209\n",
            "to: 0.05581395348837209\n",
            "for: 0.05116279069767442\n",
            ".: 0.023255813953488372\n",
            "at: 0.023255813953488372\n",
            "adjustment: 0.023255813953488372\n",
            "is: 0.018604651162790697\n",
            ",: 0.018604651162790697\n",
            "paid: 0.013953488372093023\n",
            "increases: 0.013953488372093023\n",
            "per: 0.013953488372093023\n",
            "the: 0.013953488372093023\n",
            "will: 0.013953488372093023\n",
            "cut: 0.009302325581395349\n",
            "cuts: 0.009302325581395349\n",
            "(: 0.009302325581395349\n",
            "differentials: 0.009302325581395349\n",
            "has: 0.009302325581395349\n",
            "stayed: 0.009302325581395349\n",
            "was: 0.009302325581395349\n",
            "freeze: 0.009302325581395349\n",
            "increase: 0.009302325581395349\n",
            "would: 0.009302325581395349\n",
            "yesterday: 0.004651162790697674\n",
            "effect: 0.004651162790697674\n",
            "used: 0.004651162790697674\n",
            "climate: 0.004651162790697674\n",
            "reductions: 0.004651162790697674\n",
            "limit: 0.004651162790697674\n",
            "now: 0.004651162790697674\n",
            "moved: 0.004651162790697674\n",
            "adjustments: 0.004651162790697674\n",
            "slumped: 0.004651162790697674\n",
            "move: 0.004651162790697674\n",
            "evolution: 0.004651162790697674\n",
            "went: 0.004651162790697674\n",
            "factor: 0.004651162790697674\n",
            "Royal: 0.004651162790697674\n",
            "again: 0.004651162790697674\n",
            "changes: 0.004651162790697674\n",
            "holds: 0.004651162790697674\n",
            "fall: 0.004651162790697674\n",
            "-: 0.004651162790697674\n",
            "from: 0.004651162790697674\n",
            "base: 0.004651162790697674\n",
            "on: 0.004651162790697674\n",
            "review: 0.004651162790697674\n",
            "while: 0.004651162790697674\n",
            "collapse: 0.004651162790697674\n",
            "being: 0.004651162790697674\n",
            "outlook: 0.004651162790697674\n",
            "rises: 0.004651162790697674\n",
            "drop: 0.004651162790697674\n",
            "guaranteed: 0.004651162790697674\n",
            ",\": 0.004651162790697674\n",
            "structure: 0.004651162790697674\n",
            "and: 0.004651162790697674\n",
            "could: 0.004651162790697674\n",
            "related: 0.004651162790697674\n",
            "hike: 0.004651162790697674\n",
            "we: 0.004651162790697674\n",
            "policy: 0.004651162790697674\n",
            "revision: 0.004651162790697674\n",
            "led: 0.004651162790697674\n",
            "action: 0.004651162790697674\n",
            "zone: 0.004651162790697674\n",
            "slump: 0.004651162790697674\n",
            "had: 0.004651162790697674\n",
            "difference: 0.004651162790697674\n",
            "in: 0.004651162790697674\n",
            "raise: 0.004651162790697674\n",
            "support: 0.004651162790697674\n",
            "gap: 0.004651162790697674\n",
            "projected: 0.004651162790697674\n",
            "approached: 0.004651162790697674\n",
            "instability: 0.004651162790697674\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install pytorch-transformers --quiet"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "L-tDCIQWJLmW",
        "outputId": "7717b892-9a75-4e74-b7ad-291234531e86"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/176.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.4/176.4 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m897.5/897.5 kB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.5/12.5 MB\u001b[0m \u001b[31m23.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.7/82.7 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Import required libraries\n",
        "import torch\n",
        "from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
        "\n",
        "# Load pre-trained model tokenizer (vocabulary)\n",
        "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "DCdvvrRJKUtA",
        "outputId": "a9e683ec-1eca-4520-c1d4-eaf143afdb12"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 1042301/1042301 [00:00<00:00, 7644042.46B/s]\n",
            "100%|██████████| 456318/456318 [00:00<00:00, 5700107.25B/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Define the input text\n",
        "text = \"I am thinking\"\n",
        "print(f\"Input text: {text}\")\n",
        "\n",
        "# Encode the input text\n",
        "indexed_tokens = tokenizer.encode(text)\n",
        "\n",
        "# Convert indexed tokens to a PyTorch tensor\n",
        "tokens_tensor = torch.tensor([indexed_tokens])\n",
        "\n",
        "# Load pre-trained model (weights)\n",
        "model = GPT2LMHeadModel.from_pretrained('gpt2')\n",
        "\n",
        "# Set the model in evaluation mode to deactivate the DropOut modules\n",
        "model.eval()\n",
        "\n",
        "# Check if CUDA is available and move model and tensors to GPU if possible\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "tokens_tensor = tokens_tensor.to(device)\n",
        "model.to(device)\n",
        "\n",
        "print(f\"Using device: {device}\")\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "V6xBHpdMsBHU",
        "outputId": "1dcbcb0e-fa06-4a9a-a5bb-f80de12737f2"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Input text: I am thinking\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 665/665 [00:00<00:00, 420568.78B/s]\n",
            "100%|██████████| 548118077/548118077 [00:19<00:00, 28242813.65B/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Predict next token\n",
        "with torch.no_grad():\n",
        "    outputs = model(tokens_tensor)\n",
        "    predictions = outputs[0]\n",
        "\n",
        "# Get the predicted next sub-word (token)\n",
        "predicted_index = torch.argmax(predictions[0, -1, :]).item()\n",
        "predicted_token = tokenizer.decode([predicted_index])\n",
        "\n",
        "# Add the predicted token to the original text\n",
        "predicted_text = text + predicted_token\n",
        "\n",
        "# Print the results\n",
        "print(f\"Predicted next token: '{predicted_token}'\")\n",
        "print(f\"Complete predicted text: '{predicted_text}'\")"
      ],
      "metadata": {
        "id": "a2uBQHYssUCQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "#Assignment -- Extend the code above to a sentence"
      ],
      "metadata": {
        "id": "FPDNGoo9OLis"
      }
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "kK6GIh5NOPKv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Solution\n"
      ],
      "metadata": {
        "id": "cWN1ccXBryOO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional: Generate multiple next tokens\n",
        "num_tokens_to_generate = 10\n",
        "generated_text = text\n",
        "\n",
        "for _ in range(num_tokens_to_generate):\n",
        "    # Encode all text generated so far\n",
        "    indexed_tokens = tokenizer.encode(generated_text)\n",
        "    tokens_tensor = torch.tensor([indexed_tokens]).to(device)\n",
        "\n",
        "    # Predict next token\n",
        "    with torch.no_grad():\n",
        "        outputs = model(tokens_tensor)\n",
        "        predictions = outputs[0]\n",
        "\n",
        "    # Get the predicted next token\n",
        "    predicted_index = torch.argmax(predictions[0, -1, :]).item()\n",
        "    predicted_token = tokenizer.decode([predicted_index])\n",
        "\n",
        "    # Add the predicted token to the generated text\n",
        "    generated_text += predicted_token\n",
        "\n",
        "print(f\"\\nGenerated text with {num_tokens_to_generate} additional tokens:\")\n",
        "print(generated_text)"
      ],
      "metadata": {
        "id": "0VsRa-N-r1pC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "-00Xu4A1slQS"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}